* Set the working directory: replace the bracketed placeholder with the folder that holds the data subfolder used by the relative paths below
cd [put directory here]

**********************
* This program parses the Library of Congress 2016 catalog data
* (the Books.All.2016 XML part files) into chunked Stata .dta files.
* See https://www.loc.gov/item/2020445551


***********************************************
* Read the raw files 1-43 and save one parsed .dta file per chunk.
*
* Every part file is read from the data folder in slices of chunk rows.
* Each slice after the first starts overlap rows early, so a record that
* straddles a slice boundary is seen whole by the following slice.
* Output: data/temp_<j>_<k>.dta, one row per record id, one column per
* retained field name (vleader, vtag001, vtag260codea and so on).
*
* NOTE: a record that lies completely inside the overlap window, and is not
* the last record of the earlier slice, is written to both chunk files.
* De-duplicate when the chunk files are combined.
* NOTE(review): the parsing assumes one XML element per line of the file.

local chunk   2000000
local overlap 1000

forvalues j = 1/43 {

	* The file-name suffix is zero-padded to two digits: part01 to part43
	local part : display %02.0f `j'
	local xml "data/Books.All.2016.part`part'.xml"

	* First pass: load the whole file only to count its rows
	import delimited "`xml'", clear
	local max = _N
	local T1 = int(`max'/`chunk') + 1

	forvalues k = 1/`T1' {
		local start = max((`k'-1)*`chunk' - `overlap', 1)
		local end = min(`k'*`chunk', `max')

		import delimited "`xml'", encoding(UTF-8) rowrange(`start':`end') clear

		* Remember the file order so repeated fields can be numbered reproducibly
		gen long row = _n

		* Only the first 100 characters of each line are parsed
		gen v = substr(v1,1,100)
		drop v1
		* Pieces: v2 is the element kind, v3 its attributes, v4 its text
		split v, parse("field" ">" "<")

		gen value = v4
		gen name = "leader" if v2=="leader"
		* The leader has no attributes, so its text lands in v3
		replace value = v3 if v2=="leader"
		replace name = "tag001" if index(v2,"control")>0 & index(v3,"001")>0
		replace name = "tag003" if index(v2,"control")>0 & index(v3,"003")>0
		replace name = "tag005" if index(v2,"control")>0 & index(v3,"005")>0

		* For data fields keep the text before the first indicator attribute
		* and carry it forward onto the rows that follow
		split v3 if v2=="data", parse(" ind")
		gen prefix = v31
		replace prefix = prefix[_n-1] if prefix=="" & prefix[_n-1]~=""

		* A subfield name is the carried-forward tag plus its own code
		replace name = prefix + v3 if v2=="sub"
		replace name = prefix if name==""
		replace name = v3 if v2=="control" & name==""

		* Strip quotes, blanks and equals signs so names are usable as
		* variable-name suffixes in the reshape below
		replace name = subinstr(name,char(34),"",.)
		replace name = subinstr(name," ","",.)
		replace name = subinstr(name,"=","",.)

		keep if (name=="leader" | name=="tag001" | name=="tag003" | name=="tag005") | ///
			index(name,"tag260")>0 | index(name,"tag650")>0 | index(name,"tag651")>0 | ///
			index(name,"tag100")>0 | index(name,"tag043")>0 | index(name,"tag050")>0

		* A leader line opens a new record; id is a running record counter
		gen new = v2=="leader"
		gen id = sum(new)
		* Rows before the first leader belong to a record cut by the slice start
		drop if id==0
		* The last record of a slice may be cut by the slice end, so drop it.
		* The final slice ends at the end of the file, so its last record
		* is complete and is kept.
		egen maxid = max(id)
		drop if id==maxid & `k' < `T1'

		keep value name id row
		drop if name=="" | value==""

		* Number repeated fields within a record in file order.
		* Sorting on row makes the numbering reproducible.
		bysort id name (row): gen n = _n
		drop row
		rename value v
		recast strL v
		reshape wide v, i(id n) j(name) string

		* Only the n==1 row of a record carries the leader, so this keeps one
		* row per record holding the first occurrence of every field
		drop if vleader==""

		save "data/temp_`j'_`k'.dta", replace
	}

}
